#Input: the output of mwu_to_cluster.py - tab-delimited txt file(s) with the last column being the normalized fitness values.
#This script is for k-nearest-neighbor analysis.
#Usage: python fit_to_array_for_knn.py compound.txt compound.txt compound.txt geneofinterest
#Needs to output an array.
#This version uses all genes.
#Each gene needs to be its own category.
#The only non-.txt argument entered should be the gene of interest.

import sys
import numpy as numpy
import matplotlib.pyplot as plt 
from sklearn import neighbors

#put data in array format

def load_fitness_values(paths):
    """Collect the fitness values of every gene from tab-delimited files.

    Every non-blank line must carry the gene name in its first column and a
    float fitness value in its sixth column. Files are read in the order
    given, so each gene's list holds one value per occurrence, in read order.

    Args:
        paths: iterable of paths of the files to read.

    Returns:
        dict mapping gene name -> list of float fitness values.

    Raises:
        IndexError: a non-blank line has fewer than six columns.
        ValueError: the sixth column cannot be parsed as a float.
    """
    fitness_by_gene = {}
    for path in paths:
        # 'with' closes each file as soon as it has been read.
        with open(path) as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank lines (e.g. an empty line at end of file).
                    continue
                fields = line.split('\t')
                # float() ignores the trailing newline on the last column.
                fitness_by_gene.setdefault(fields[0], []).append(
                    float(fields[5]))
    return fitness_by_gene


# Containers filled in further down: X gets one fitness vector per known gene,
# y the matching gene names, unknown the vector of the gene of interest.
y = []
X = []
unknown = []

# Skip sys.argv[0]: it is this script's own path, not a user argument, and
# would otherwise be mistaken for the gene of interest.
txt_paths = []
geneofinterest = None
for arg in sys.argv[1:]:
    if arg.endswith('.txt'):
        txt_paths.append(arg)
    else:
        # If several non-.txt arguments are given, the last one wins.
        geneofinterest = arg

genenames = load_fitness_values(txt_paths)

#print genenames
# Split the per-gene fitness vectors into the query sample (the gene of
# interest) and the training set, where every other gene is its own class.
# NOTE(review): the vectors only have equal lengths if every gene occurs the
# same number of times across the input files - nothing here enforces that.
# .items() is used instead of the Python-2-only .iteritems() so the loop runs
# on both Python 2 and Python 3.
for gene, fitvals in genenames.items():
    if gene == geneofinterest:
        unknown.append(fitvals)
    else:
        X.append(fitvals)
        y.append(gene)

# Without a query sample there is nothing to classify; stop with a clear
# message instead of failing later inside scikit-learn.
if not unknown:
    sys.exit("gene of interest %r was not found in the input files"
             % (geneofinterest,))
#print X

#fit data with K nearest neighbors

n_neighbors = 5  # scikit-learn's default; the iris example uses 15
h = 0.2  # step size in mesh (from the iris plotting example); not used by the fit below

# See http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# for the available options.
# Exactly one model is fitted because only one classifier is used for the
# predictions further down; fitting 'uniform' first and then 'distance' would
# simply discard the first fit. 'distance' weights each neighbor's vote by the
# inverse of its distance ('uniform' would weight all neighbors equally).
weights = 'distance'
neigh = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
neigh.fit(X, y)

#predict for unknown
# Single-argument print(...) produces the same output on Python 2 and 3.

# Predicted class label (a gene name) for the gene of interest.
print(neigh.predict(unknown))

# Mean accuracy of the fitted model on its own training data.
print(neigh.score(X, y))

# predict_proba returns one row per query sample; each row holds one
# probability per training class, ordered like neigh.classes_.
for class_probabilities in neigh.predict_proba(unknown):
    for probability in class_probabilities:
        print(probability)